/*	This program creates the working dataset for the main analysis sample,
centered at the month of the 50th birthday. */


* ---------------------------------------------------------------------------
* Project folders and name of the dataset produced by this do-file
* ---------------------------------------------------------------------------
local dir_raw    "~/Dropbox/Retirement gaming/raw"
local dir_clean  "~/Dropbox/Retirement gaming/clean"
local dir_output "~/Dropbox/Retirement gaming/output/dataverse"
local dataname   "mainsample_age50.dta"

* Load admindata.dta from the clean folder; sort rows by i, t, j and move
* those three identifiers to the front of the variable list
use "`dir_clean'/admindata.dta", clear
sort i t j
order i t j


*** AGE VARIABLES, CENTERED AT THE 50TH BIRTHDAY
* Monthly date of birth (Fnac is displayed as a daily date)
format Fnac %td
generate birth_month = mofd(Fnac)
format birth_month %tm

* Month of the 50th birthday: 600 months after the month of birth
generate refbday_month = birth_month + 600
format refbday_month %tm

* Months elapsed since the 50th-birthday month (negative before it)
generate agemonths_centered = t - refbday_month

* Age in months, and age in years (stored as age, replacing any existing age)
generate agemonths = agemonths_centered + 600
capture drop age
generate age = agemonths/12


** TIME OF SERVICE VARIABLES **

* Months observed since the start of the data: running count, within i, of
* distinct (i, t) pairs
sort i t j
capture drop aux
by i t: generate aux = (_n == 1)   // flags one row per individual and month
by i: generate served_mthscum = sum(aux)
drop aux

* Months registered before the data start: for rows dated 1996m4, the months
* elapsed since the month of Fing; zero on every other row or when Fing is
* missing. The individual-level value is the maximum over the person's rows.
generate tjobstart = mofd(Fing)
generate aux = cond(t == tm(1996m4) & !missing(tjobstart), t - tjobstart, 0)
by i: egen served_mthspredata = max(aux)
drop aux tjobstart

* Observed months plus pre-data months
generate total_mthscum = served_mthscum + served_mthspredata

* Served months by age 50 (jobs held before the start of the sample are unknown).
* For each cumulative counter, take the largest value reached on rows at or
* before the 50th-birthday month; the result is missing for individuals with
* no such row.
foreach v of varlist served_mthscum total_mthscum {
	capture drop aux
	generate aux = cond(agemonths_centered <= 0, `v', .)
	bysort i: egen `v'_by0 = max(aux)
}
** 1) missings: too old at start of data, and employed when data starts
* Flag individuals whose by-age-50 total is missing
capture drop aux
generate aux = (total_mthscum_by0 == .)
bysort i: egen anymiss = max(aux)
drop aux
sort i t j

* First month in which each flagged individual is observed
bysort i: egen auxt = min(t) if anymiss == 1
format auxt %tm

* For people who enter the sample at the beginning (1996m4) but are already
* older than 50, impute time served by age 50 as time served up to data start
* minus the months since turning 50. The amount is computed on the person's
* first counted month (served_mthscum == 1), floored at zero, and is zero on
* all other rows; the individual-level value is the maximum over rows.
generate auxadd = cond(anymiss == 1 & auxt == tm(1996m4) & served_mthscum == 1, ///
	max(0, served_mthspredata - agemonths_centered + 1), 0)
bysort i: egen auxaddi = max(auxadd)
replace total_mthscum_by0 = auxaddi if anymiss == 1 & auxt == tm(1996m4)

drop auxadd auxaddi anymiss auxt
summarize total_mthscum_by0
** 2) censoring: too young at end of data
* aux  = months relative to the 50th birthday at the individual's last row
* auxt = last month in which the individual is observed
capture drop aux
bysort i: egen aux  = max(agemonths_centered)
bysort i: egen auxt = max(t)
format auxt %tm

* Last observed at the end of the sample (2016m6) while still younger than 50
generate auxyoung = (auxt == tm(2016m6) & aux < 0)

* Impute the months left until turning 50, assuming they work all of them.
* total_mthscum_by0 is an individual-level quantity, so the adjustment uses
* the individual's months-to-50 at the LAST observed row (-aux). Subtracting
* the row's own agemonths_centered would make the result vary across rows of
* the same person and would count the months between that row and the last
* observation twice.
replace total_mthscum_by0 = total_mthscum_by0 - aux if auxyoung == 1

* Scratch variables are not used again
drop aux auxt auxyoung


****** Keep industria & comercio
drop aportaci_min

* anynotic flags individuals who lose at least one row because that row is
* not in industria y comercio (aportaci different from 1, including missing)
capture drop aux
generate aux = (aportaci != 1)
bysort i: egen anynotic = max(aux)
drop aux

keep if aportaci == 1

		
*** Gender in sample
* Keep rows with sexo equal to 1 (original note: this drops females)
keep if sexo == 1

*** Identify and drop people in early retirement regimes
* Original notes on the vf_min codes: 103 = working while already retired;
* 97 and 98 = reported with no service to the firm
drop if inlist(vf_min, 97, 98, 103)


****** RETIREMENT AGE
* Month of Fegr
generate tegr = mofd(Fegr)
format tegr %tm

* Row is in the month of Fegr and reported as leaving due to retirement
generate retirnow = (causal_5 == 1 & t == tegr)
tabulate retirnow

* Month of the first retirement record of each individual
capture drop aux
generate aux = cond(retirnow == 1, t, .)
bysort i: egen tretir = min(aux)
format tretir %tm
drop aux

* Rows strictly after the first retirement month; only the first retirement
* record keeps retirnow equal to 1
generate postretir = (t > tretir)
replace retirnow = 0 if postretir == 1

* Individual retires at some point within the sample
bysort i: egen retir_insample = max(retirnow)
tabulate retir_insample postretir


*** Drop people with employment after retirement
* A row is flagged when it lies after the first retirement month and reports
* positive, non-missing W; every row of a flagged individual is removed
generate flag = (postretir == 1 & W > 0 & !missing(W))
bysort i: egen iflag = max(flag)
drop if iflag == 1
drop flag iflag


* Age in years rounded to the nearest integer
generate agedisc = round(age)

* Self-employed and employed indicators
generate self_empl = (status_1 == 1)
generate empl      = (status_3 == 1)

* Keep self-employed and employed observations
keep if empl == 1 | self_empl == 1

* Drop rows with no earnings reported (original note: dropping W equal to
* zero also drops periods with paid leave)
drop if W == . | W == 0

* Keep only salaried work observations (original note: tipREM_min equal to 1
* is a monthly salary)
generate sample_salary = (tipREM_min == 1)
keep if sample_salary == 1

*** Drop earnings outliers
* Cut-offs are the 5th and 95th percentiles of W over all remaining rows
* (the original notes record 3.604606 and 81.31013 next to these cut-offs)
tempname p5 p95
tempvar wout anyout
summarize W, detail
scalar `p5'  = r(p5)
scalar `p95' = r(p95)

* Row-level outlier flag, then individual-level flag anywout for people who
* have at least one outlier row
generate `wout' = (W < `p5') | (W > `p95')
bysort i: egen `anyout' = max(`wout')
generate anywout = (`anyout' == 1)

* Only the outlier rows themselves are removed
drop if `wout' == 1
drop `wout' `anyout'


*** Firm size
bysort self_empl: summarize ndep

* Categorical firm size built from ndep
label define size_cat 0 "Micro less than 5 " 1 "Micro 5-9" 2 "Small 10-19" 3 "Small 20-49" 4 "Medium 50-249" 5 "Large 250 plus"
generate ndep_cat = 0 if ndep < 5                  // micro 1, less than 5
replace  ndep_cat = 1 if ndep >= 5   & ndep < 10   // micro 2
replace  ndep_cat = 2 if ndep >= 10  & ndep < 20   // small 1
replace  ndep_cat = 3 if ndep >= 20  & ndep < 50   // small 2
replace  ndep_cat = 4 if ndep >= 50  & ndep < 250  // medium
* Missing values compare as larger than any number in Stata, so the open-ended
* top category must exclude them explicitly; otherwise rows with missing ndep
* would be labelled "Large 250 plus". They now keep a missing ndep_cat.
replace  ndep_cat = 5 if ndep >= 250 & !missing(ndep)  // large
label values ndep_cat size_cat
tabulate ndep_cat self_empl

* Calendar year of the observation month
capture drop year
generate year = yofd(dofm(t))


*** Keep only main job and drop duplicates
* Step 1: within duplicated (i, t), prefer salaried rows. Every remaining row
* already has sample_salary equal to 1 at this point, so this step only
* reports the tabulation.
duplicates tag i t, generate(tag)
tabulate tag sample_salary
bysort i t: egen maxsal = max(sample_salary)
drop if tag > 0 & maxsal == 1 & sample_salary == 0
drop tag maxsal

* Step 2: within duplicated (i, t), keep the row(s) with the highest W
duplicates tag i t, generate(tag)
bysort i t: egen maxrem = max(W)
drop if tag > 0 & W < maxrem
drop tag maxrem

* Step 3: report the remaining ties, then keep exactly one row per (i, t, W).
* Stata orders tied observations arbitrarily when sorting, so keeping "the
* first" row after sorting on i t alone could retain a different job (and
* different job-level variables) from run to run. Ties are therefore broken
* by the lowest j, which makes the result reproducible whenever j differs
* across the tied rows.
duplicates tag i t, generate(tag)
tabulate tag
drop tag
bysort i t W (j): keep if _n == 1


*** Plot age-earnings profile (Figure 1c)
* Everything between preserve and restore is temporary
preserve
keep if age >= 45 & age < 58

* Worker group (wages are conditional on reporting):
*   2 = self-employed; otherwise, among employed rows,
*   1 = ndep_cat of 1 or lower (fewer than 10 workers), 0 = all other employees
generate group = cond(self_empl == 1, 2, cond(empl == 1, cond(ndep_cat <= 1, 1, 0), .))
label define group 0 "Employees large" 1 "Employees small" 2 "Self-employed"
label values group group

* Age in completed years, defined only for ages 45-57 and for rows that are
* not before 2005 (data from 2005 onwards are used)
generate age_d = floor(age) if inrange(floor(age), 45, 57) & !(year < 2005)

* Mean W per group and age; ok marks one row per cell for plotting
bysort group age_d: egen m_W = mean(W)
bysort group age_d: generate ok = (_n == 1)

* Normalise each group's profile by its mean W at age 49
foreach grp of numlist 0/2 {
	summarize W if group == `grp' & age_d == 49
	local base = r(mean)
	replace m_W = m_W/`base' if group == `grp'
}
label variable m_W "Earnings"
label variable age_d "Age"

twoway (line m_W age_d if ok==1 & group==0 & age_d>=45 & age_d<=57, lpattern(dash)) ///
	(line m_W age_d if ok==1 & group==1 & age_d>=45 & age_d<=57, lpattern(dash_dot)) ///
	(line m_W age_d if ok==1 & group==2 & age_d>=45 & age_d<=57, lpattern(solid)), ///
	legend(position(0) bplacement(nwest) order(1 "Employees (Firms w/10+ workers)" 2  "Employees (Firms w/<10 workers)" 3 "Self-employed") cols(1)) ///
	scheme(s1mono) ///
	xscale(range(45 57)) xlabel(46(2)57) xline(49) xtitle("Age") ///
	yscale(range(.96 1.1)) ylabel(.96(.04) 1.1, grid)   ytitle("Earnings (1=average at 49 years old)")
graph export "`dir_output'/figure1c.png",  replace

* restore discards the plotting variables and the age/year restrictions
restore


*JOB CHANGES
xtset i t
sort i t

* auxj = j in the previous month; when the previous month is not in the data,
* fall back to j on the individual's previous row
capture drop auxj
by i: generate auxj = L.j
by i: replace auxj = j[_n-1] if auxj == .

* jobchange = 1 when j differs from the previous j; missing when there is no
* previous value to compare with
generate jobchange = (j != auxj) if auxj != .

* Any job change within the individual's rounded age, blanked on rows where
* jobchange itself is missing
bysort i agedisc: egen jobchange_atage = max(jobchange)
replace jobchange_atage = . if jobchange == .

* Any job change at all for the individual
bysort i: egen anyjobchange = max(jobchange)


*** AGE CENTERED AT 50
* Whole years relative to the 50th birthday: value y covers months
* [12*y, 12*(y+1)). Defined for y = -5,...,7 only, i.e. months -60 to 95;
* missing outside that range.
generate age_centered = floor(agemonths_centered/12) ///
	if agemonths_centered >= -60 & agemonths_centered < 96

*** 2-digit CIIU
* Convert ciiu to text, keep its first two characters, convert back to numeric
capture drop aux
tostring ciiu, generate(aux)
generate ciiu2 = substr(aux, 1, 2)
drop aux
destring ciiu2, replace


*****************************************************************************
*** SAMPLE RESTRICTIONS ***
*****************************************************************************

*** Cohorts in sample
* cohort is the year of birth; only birth months from 1941m4 up to (but not
* including) 1971m4 remain
generate cohort = yofd(Fnac)
drop if birth_month < tm(1941m4) | birth_month >= tm(1971m4)

*** Keep observations in the relevant interval around age 50
* From 60 months before to 95 months after the 50th-birthday month
drop if agemonths_centered < -60 | agemonths_centered >= 96

* Drop individuals with simultaneous employment and self-employment
* *_mth: status present in the month (i, t); *_any: status ever present for i
bysort i t: egen self_empl_mth = max(self_empl)
bysort i:   egen self_empl_any = max(self_empl)
bysort i t: egen empl_mth = max(empl)
bysort i:   egen empl_any = max(empl)

* inboth marks months with both statuses; iinboth marks individuals who are
* simultaneously employed and self-employed at some time in the sample period
generate inboth = (empl_mth == 1 & self_empl_mth == 1)
bysort i: egen iinboth = max(inboth)
tabulate iinboth
drop if iinboth == 1
drop inboth iinboth

* Determine each person's max number of months in sample (depending on cohort)
* The age window kept above runs from 60 months before to 95 months after the
* 50th-birthday month (156 months). The data are treated as running from
* 1996m4 to 2016m6, the two sample bounds used earlier in this file.
* NOTE(review): confirm 2016m6 is the last month of the administrative data.
local window     = 156
local firstmonth = tm(1996m4)
local lastmonth  = tm(2016m6)

* Earliest birth month whose window starts no earlier than the data (1951m4)
* and latest birth month whose window ends no later than the data (1958m7).
* The previous upper anchor, 2016-(50+7)+1 = 1960 (March), lies after the last
* fully observable cohort, which overstated max_months for late cohorts.
local firstfull = `firstmonth' - (600 - 60)
local lastfull  = `lastmonth' - (600 + 95)

generate max_months = `window'
* Older cohorts lose one month for every month born before the first full cohort
replace max_months = `window' - ((`firstfull') - birth_month) if birth_month < (`firstfull')
* Younger cohorts lose one month for every month born after the last full cohort
replace max_months = `window' - (birth_month - (`lastfull')) if birth_month > (`lastfull') & !missing(birth_month)
tabulate max_months

* Select sample
foreach grp in empl self_empl {
	* Months the individual is observed in this status, and the share of the
	* maximum number of months available to the cohort
	bysort i: egen count_m`grp' = total(`grp')
	generate prop_m`grp' = count_m`grp'/max_months
	* Observation-level sample: row is in this status and the individual has
	* at least 6 months in it overall
	generate sample_`grp' = (`grp' == 1 & count_m`grp' >= 6)
	* Individual-level sample: the person has at least one such row
	bysort i: egen isample_`grp' = max(sample_`grp')
}

* DROP from sample of empl those who ever have self_employment
bysort i: egen empl_any2 = max(empl)
bysort i: egen self_empl_any2 = max(self_empl)
drop if self_empl_any2 == 1 & isample_empl == 1

tabulate isample_empl sample_empl, missing

* Count individuals in each sample (one row per person via order == 1)
sort i t
by i: generate order = _n
count if order == 1 & isample_empl == 1
count if order == 1 & isample_self_empl == 1

* Drop people in none of the samples
drop if !(isample_empl == 1 | isample_self_empl == 1)

tabulate isample_self_empl isample_empl

* Individuals who still have rows in both statuses are removed
bysort i: egen maxself = max(self_empl)
bysort i: egen maxempl = max(empl)

* Row counter within individual, rebuilt before the drop below
capture drop order
sort i t
by i: generate order = _n

drop if maxself == 1 & maxempl == 1
drop maxself maxempl

* Number of rows per individual, summarised over self-employed sample rows
capture drop aux
generate aux = 1
bysort i: egen count_anyobs = total(aux)
summarize count_anyobs if sample_self_empl == 1


*** Drop firm size outliers
* anyfsizeout flags individuals whose largest ndep is 5000 or more; only the
* rows with ndep of 5000 or more are removed.
* NOTE(review): a missing value also satisfies >= 5000 in Stata, so rows with
* missing ndep are dropped here and an individual with all-missing ndep is
* flagged - confirm this is intended.
bysort i: egen maxsize = max(ndep)
generate anyfsizeout = (maxsize >= 5000)
drop maxsize
drop if ndep >= 5000

 
* Keep relevant variables
keep i t j year ///
	served_mthscum* served_mthspredata* total_mthscum* ///
	age age_centered agemonths_centered ///
	*empl* W* remC1_sum remC2_sum remC3_sum amt_* ben ciiu* ///
	Tipocontr ipc ndep ndep_cat *sample* birth_month *count* ///
	cohort max_months Fing Fegr hrsmonth trem_*_max status_*_max ///
	*jobchange* any*

* Write the working dataset to the clean folder
save "`dir_clean'/`dataname'", replace

*Add MCB
* List of the distinct months present in the working dataset
tempfile months
use "`dir_clean'/`dataname'", clear
keep t
duplicates drop
save `months'

* Read the FICTO file (variable names on row 5, data starting on that row
* range) and strip thousands separators from the three amounts
import delimited "`dir_raw'/FICTO UNIPERSONALES.csv", clear rowrange(5) varnames(5)
local fictovars valorbfc mgravado aportebps
foreach v of local fictovars {
	destring `v', replace ignore(",")
}

* Month of fvigencia, parsed as month-year text
generate t = mofd(date(fvigencia, "MY", 2019))
format t %tm
sort t

* Add the sample months, then fill months without a FICTO value from the
* previous month
merge 1:1 t using `months'
sort t
tsset t
foreach v of local fictovars {
	replace `v' = L.`v' if `v' == .
	* The second row (in t order) takes the value of the first row.
	* NOTE(review): the reason for this override is not visible here - confirm.
	replace `v' = `v'[1] if _n == 2
}

* Keep only months that exist in the working dataset
drop if _merge == 1
drop _merge fvigencia

* Attach the monthly values to every row of the working dataset
merge 1:m t using "`dir_clean'/`dataname'"
drop _merge

generate rficto = mgravado/(1000*ipc)
label variable rficto "Min contribution base in 1000 Pesos of 2015"

save "`dir_clean'/`dataname'", replace

clear all
exit


